{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看\n",
    "上一章讲到了控制DataFrame显示的一些参数，本章则具体讲解一下如何获得对DataFrame的整体认知。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "E:\\ML\\实战\\pandas实用教程\n"
     ]
    }
   ],
   "source": [
    "!cd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. `info()`\n",
    "这是DataFrame才有的API。\n",
    "#### `DataFrame.info(verbose=None, memory_usage=True null_counts=True)`\n",
    "- verbose：True or False，字面意思是冗长的，也就说如何DataFrame有很多列，是否显示所有列的信息，如果为否，那么会省略一部分；\n",
    "- memory_usage：True or False，默认为True，是否查看DataFrame的内存使用情况；\n",
    "- null_counts：True or False，默认为True，是否统计NaN值的个数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>90</th>\n",
       "      <th>91</th>\n",
       "      <th>92</th>\n",
       "      <th>93</th>\n",
       "      <th>94</th>\n",
       "      <th>95</th>\n",
       "      <th>96</th>\n",
       "      <th>97</th>\n",
       "      <th>98</th>\n",
       "      <th>99</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>0 rows × 100 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]\n",
       "Index: []\n",
       "\n",
       "[0 rows x 100 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame( columns = range(0,100))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 0 entries\n",
      "Data columns (total 100 columns):\n",
      "0     0 non-null object\n",
      "1     0 non-null object\n",
      "2     0 non-null object\n",
      "3     0 non-null object\n",
      "4     0 non-null object\n",
      "5     0 non-null object\n",
      "6     0 non-null object\n",
      "7     0 non-null object\n",
      "8     0 non-null object\n",
      "9     0 non-null object\n",
      "10    0 non-null object\n",
      "11    0 non-null object\n",
      "12    0 non-null object\n",
      "13    0 non-null object\n",
      "14    0 non-null object\n",
      "15    0 non-null object\n",
      "16    0 non-null object\n",
      "17    0 non-null object\n",
      "18    0 non-null object\n",
      "19    0 non-null object\n",
      "20    0 non-null object\n",
      "21    0 non-null object\n",
      "22    0 non-null object\n",
      "23    0 non-null object\n",
      "24    0 non-null object\n",
      "25    0 non-null object\n",
      "26    0 non-null object\n",
      "27    0 non-null object\n",
      "28    0 non-null object\n",
      "29    0 non-null object\n",
      "30    0 non-null object\n",
      "31    0 non-null object\n",
      "32    0 non-null object\n",
      "33    0 non-null object\n",
      "34    0 non-null object\n",
      "35    0 non-null object\n",
      "36    0 non-null object\n",
      "37    0 non-null object\n",
      "38    0 non-null object\n",
      "39    0 non-null object\n",
      "40    0 non-null object\n",
      "41    0 non-null object\n",
      "42    0 non-null object\n",
      "43    0 non-null object\n",
      "44    0 non-null object\n",
      "45    0 non-null object\n",
      "46    0 non-null object\n",
      "47    0 non-null object\n",
      "48    0 non-null object\n",
      "49    0 non-null object\n",
      "50    0 non-null object\n",
      "51    0 non-null object\n",
      "52    0 non-null object\n",
      "53    0 non-null object\n",
      "54    0 non-null object\n",
      "55    0 non-null object\n",
      "56    0 non-null object\n",
      "57    0 non-null object\n",
      "58    0 non-null object\n",
      "59    0 non-null object\n",
      "60    0 non-null object\n",
      "61    0 non-null object\n",
      "62    0 non-null object\n",
      "63    0 non-null object\n",
      "64    0 non-null object\n",
      "65    0 non-null object\n",
      "66    0 non-null object\n",
      "67    0 non-null object\n",
      "68    0 non-null object\n",
      "69    0 non-null object\n",
      "70    0 non-null object\n",
      "71    0 non-null object\n",
      "72    0 non-null object\n",
      "73    0 non-null object\n",
      "74    0 non-null object\n",
      "75    0 non-null object\n",
      "76    0 non-null object\n",
      "77    0 non-null object\n",
      "78    0 non-null object\n",
      "79    0 non-null object\n",
      "80    0 non-null object\n",
      "81    0 non-null object\n",
      "82    0 non-null object\n",
      "83    0 non-null object\n",
      "84    0 non-null object\n",
      "85    0 non-null object\n",
      "86    0 non-null object\n",
      "87    0 non-null object\n",
      "88    0 non-null object\n",
      "89    0 non-null object\n",
      "90    0 non-null object\n",
      "91    0 non-null object\n",
      "92    0 non-null object\n",
      "93    0 non-null object\n",
      "94    0 non-null object\n",
      "95    0 non-null object\n",
      "96    0 non-null object\n",
      "97    0 non-null object\n",
      "98    0 non-null object\n",
      "99    0 non-null object\n",
      "dtypes: object(100)\n",
      "memory usage: 0.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()  # 直接默认设置即可，快捷查看多种信息：总行数和列数、每列元素类型和NaN的个数，总内存"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. `ndim, shape, size`\n",
    "这三个属性关系密切"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0  1\n",
       "0 NaN  2\n",
       "1 NaN  3"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame( [[np.nan, 2],[np.nan, 3]])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.ndim  # 返回维度数，Series一维，DataFrame两维，平时很少用到，不过有时会在循环中用到"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2, 2)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape  # (行数，列数)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.size   # 元素个数，rows×cols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. `head(), tail()`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `Series/DataFrame.head(n=5)`\n",
    "#### `Series/DataFrame.tail(n=5)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    2\n",
       "3    3\n",
       "4    4\n",
       "dtype: int32"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series( range(0,5))\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    2\n",
       "dtype: int32"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    2\n",
       "3    3\n",
       "4    4\n",
       "dtype: int32"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.tail(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. 个数统计"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1 `value_counts()`\n",
    "只适合序列。\n",
    "#### `Series/Index.value_counts(normalize=False, ascending=False, bins=None)`\n",
    "- normalize：True or False，计算频次还是频率比；\n",
    "- ascending：True or False，排序方式，默认降序；\n",
    "- bins：int，pd.cut的一种快捷操作，对连续数值型效果好；"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    2\n",
       "2    1\n",
       "3    2\n",
       "4    1\n",
       "5    3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([1,2,1,2,1,3])\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    3\n",
       "2    2\n",
       "3    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "s.value_counts(ascending = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.997, 2.0]    5\n",
       "(2.0, 3.0]      1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.value_counts( bins = 2)   # bins按照int平均分割，左开右闭，左侧外延1%以包含最左值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "set_option  max_rows,max_columns,max_colwidth, precision\n",
    "\n",
    "head\n",
    "tail\n",
    "info\n",
    "dtypes\n",
    "value_counts\n",
    "memory_usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "E:\\ML\\实战\\pandas实用教程\n"
     ]
    }
   ],
   "source": [
    "!cd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60, 20, 50)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_option('display.max_rows'),pd.get_option('display.max_columns'),pd.get_option('display.max_colwidth')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'right'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_option('display.colheader_justify')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "统计学\n",
    "sample\n",
    "rolling\n",
    "min,max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "缺失值和异常值处理\n",
    "1. 获取\n",
    "布尔操作\n",
    "& | ~ isnull(),notnull(), isin(), where\n",
    "2. 修改"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "去重"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2rc2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "141px",
    "width": "253px"
   },
   "navigate_menu": true,
   "number_sections": false,
   "sideBar": true,
   "threshold": "3",
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": true,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
